import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('seaborn-darkgrid')
# set figure size and figure format
plt.rcParams['figure.figsize'] = [8, 6]
%config InlineBackend.figure_format = 'svg'
# Override the notebook output area's _should_scroll hook so it always returns
# false; plots then stay at full height and I don't have to scroll inside them
%%javascript
IPython.OutputArea.prototype._should_scroll = function(lines){
return false;
}
# Read the Prosper loan export, then look at the first six rows and the frame's shape.
df = pd.read_csv(filepath_or_buffer='data/prosperLoanData.csv')
df.head(n=6)
df.shape
What is the structure of your dataset?
There are 113,937 observations and 81 variables.
What is/are the main feature(s) of interest in your dataset?
This dataset has a large number of variables, and it is hard to tell from the variable names alone which are the most interesting. After some analysis I found that some features are more interesting than others. Throughout the project I will work with Borrower Rate, Borrower APR, Prosper Score, Credit Score, Original Loan Amount, Monthly Payment, Borrower Occupation, Borrower State, Borrower Employment Status, and other features if they turn out to be interesting later on.
What features in the dataset do you think will help support your investigation into your feature(s) of interest?
I will look into how Borrower Rate and APR change over time and what makes a borrower default. For that, I think Loan Origination Date, Borrower Rate and APR, Prosper Score, Income Range, Employment Status and some other features will help me investigate my features of interest.
# First colour of the current seaborn palette, reused by the single-colour charts.
base_color = sns.color_palette()[0]
# Occupation bar plot: the 20 most frequent occupations, most common first
top_occupations = df['Occupation'].value_counts().head(20)
top_occupations.plot.bar(color=base_color);
plt.xlabel('Occupation')
plt.ylabel('Count');
plt.title('Occupation Types Count');
'Other' is the largest occupation category among people who borrow money from Prosper. Other notable categories are Professional and Computer Programmer.
# Prosper Score bar plot, bars ordered by frequency
base_color = sns.color_palette()[0]
score_counts = df['ProsperScore'].value_counts()
score_counts.plot.bar(color=base_color);
plt.xlabel('Prosper Score')
plt.ylabel('Count')
plt.title("Prosper Score Count");
The majority of borrowers have a Prosper score of 4, 6, or 8.
# Ordered-categorical view of the Prosper scores, with missing scores dropped.
prosperscore = df['ProsperScore'].dropna()
# The categories must hold the same kind of values as the column. The scores are
# numeric (presumably parsed as floats 1.0 ... 11.0 because the column has gaps),
# so string labels such as '1.0' would match nothing and astype() would turn
# every score into NaN.
level_order = [float(score) for score in range(1, 12)]
order_cat = pd.api.types.CategoricalDtype(ordered=True, categories=level_order)
prosperscore = prosperscore.astype(order_cat)
# Prosper score count plot in the shared base colour.
# Colour notes kept for reference: '#099DD9' is a good one; other candidates are
# ['#009ACD', '#ADD8E6', '#63D1F4', '#0EBFE9', '#C1F0F6', '#0099CC'],
# sns.color_palette()[9] and 'darkturquoise'.
ax = sns.countplot(x='ProsperScore', data=df, color=base_color)
plt.xticks(rotation=0)
ax.set_xlabel('Prosper Score')
ax.set_ylabel('Count');
# Same Prosper score count plot, drawn with the hand-picked colour '#099DD9'
ax = sns.countplot(x='ProsperScore', data=df, color='#099DD9')
plt.xticks(rotation=0)
ax.set_xlabel('Prosper Score')
ax.set_ylabel('Count')
ax.set_title('Prosper Score Count');
# Express BorrowerRate in percent with one vectorised multiply instead of a
# Python-level lambda per row. The guard makes the cell safe to re-run: once the
# column is in percent it is not scaled a second time.
# NOTE(review): assumes the raw rates are fractions no larger than 1 -- confirm.
if df['BorrowerRate'].max() <= 1:
    df['BorrowerRate'] = df['BorrowerRate'] * 100
# Borrower Rate histogram: bins 5 wide from 0 past the maximum rate,
# x ticks every 5 from 0 to 45, drawn vertically
bins = np.arange(0, df['BorrowerRate'].max() + 5, 5)
ticks = np.arange(0, 50, 5)
labels = [str(v) for v in ticks]
plt.hist(data=df, x='BorrowerRate', bins=bins);
plt.xticks(ticks, labels, rotation=90);
plt.xlabel('Borrower Rate')
plt.ylabel('Count')
plt.title("Borrower Rate Count");
Most people have a borrower rate between 5% and 35%. The mean is 18.4%.
# Summary statistics for the two rate columns
df['BorrowerRate'].describe()
df['BorrowerAPR'].describe()
# Borrower APR histogram: bins 0.02 wide, x ticks every 0.025 from 0 up to (not including) 0.5
bins = np.arange(0, df['BorrowerAPR'].max() + .02, .02)
ticks = list(np.arange(0, 0.5, 0.025))
labels = [format(v, '.2f') for v in ticks]
plt.hist(data=df, x='BorrowerAPR', bins=bins);
plt.xticks(ticks, labels, rotation=90);
plt.xlabel('Borrower APR')
plt.ylabel('Count')
plt.title("Borrower APR Count");
The mean borrower APR is around 0.20. One interesting thing is that a lot of people have an APR of about 0.35.
# One-column frame of the employment statuses, with missing values dropped.
employment = df[['EmploymentStatus']].dropna()
employ_count = employment['EmploymentStatus'].value_counts()
n_employment = employment.shape[0]
# value_counts() sorts descending, so the first entry is the largest group.
# Use .iloc for positional access: the index holds status labels, and an integer
# key on a label index only works through a deprecated positional fallback.
max_employ_count = employ_count.iloc[0]
max_prop = max_employ_count / n_employment
print(max_prop)
np.arange(0, max_prop, 0.05)
# Tick positions expressed as proportions (every 5%) and their printed labels.
tick_prop = np.arange(0, max_prop, 0.05)
tick_name = ['{:0.2f}'.format(v) for v in tick_prop]
type_order = employ_count.index
# Horizontal count plot whose x axis is relabelled as a proportion of all rows.
sns.countplot(data=employment, y='EmploymentStatus', color=base_color,
              order=type_order)
plt.xticks(tick_prop * n_employment, tick_name)
plt.xlabel('Proportion')
# Write each bar's percentage just past the end of the bar; bars are drawn in
# the same order as employ_count, so the position i is the bar's y coordinate.
for i, count in enumerate(employ_count):
    pct_string = '{:0.1f}%'.format(100 * count / n_employment)
    plt.text(count + 1, i, pct_string, va='center')
# Plain vertical count plot of the same variable, largest group first.
sns.countplot(data=df, x='EmploymentStatus', color=base_color, order=type_order);
plt.xticks(rotation=15);
plt.xlabel('Employment Status')
plt.ylabel('Count');
plt.title('Employment Status Count');
Most people are either 'Employed' or 'Full-time'.
# Pie chart of homeowner vs non-homeowner counts, starting at 12 o'clock and running clockwise
plt.figure(figsize=(6, 4))
sorted_count = df['IsBorrowerHomeowner'].value_counts()
plt.pie(sorted_count, labels=sorted_count.index, counterclock=False, startangle=90)
plt.axis('square');
plt.title('Is Borrower Home Owner');
About half of the borrowers are homeowners and the other half are not.
# Raw homeowner counts
df['IsBorrowerHomeowner'].value_counts()
# Homeowner count plot (seaborn's default colours, one per category)
plt.figure(figsize=(6, 4))
sns.countplot(x='IsBorrowerHomeowner', data=df);
plt.xlabel("Home Owner")
plt.ylabel("Count")
plt.title("Is Borrower Home Owner");
# Income range counts, then the same counts as bars ordered from most to least frequent
income_counts = df['IncomeRange'].value_counts()
income_counts
type_order_income = income_counts.index
sns.countplot(x='IncomeRange', data=df, order=type_order_income, color=base_color)
plt.xticks(rotation=90);
plt.xlabel('Income Range')
plt.ylabel('Count')
plt.title('Income Range Count');
Most people have an income between 25,000 and 75,000 dollars.
# Lender yield histogram: bins 0.02 wide, x ticks every 0.025 from 0 up to (not including) 0.5
bins = np.arange(0, df['LenderYield'].max() + .02, .02)
ticks = list(np.arange(0, 0.5, 0.025))
labels = [format(v, '.2f') for v in ticks]
plt.hist(data=df, x='LenderYield', bins=bins);
plt.xticks(ticks, labels, rotation=90);
plt.xlabel('Lender Yield')
plt.ylabel('Count')
plt.title('Lender Yield Histogram');
Lender yield looks approximately normally distributed, and this plot is quite similar to the Borrower APR plot.
# Total inquiries: keep only the 20 most frequent values and
# draw their bars from highest to lowest count
inquiry_counts = df['TotalInquiries'].value_counts()
order_type_inq = inquiry_counts.head(20).index
plt.figure(figsize=(8, 6))
sns.countplot(x='TotalInquiries', data=df, color=base_color, order=order_type_inq);
plt.xticks(rotation=15);
plt.xlabel('Total Inquiries')
plt.ylabel('Count')
plt.title('Total Inquiries Count');
Most people have fewer than 6 inquiries. The most common number of inquiries is 2.
# Same variable without an explicit order; the x limits crop the view
# so that only the first bars (positions -1 to 20.5) are visible
plt.figure(figsize=(8, 6))
sns.countplot(x='TotalInquiries', data=df, color=base_color);
plt.xticks(rotation=20);
plt.xlim(-1, 20.5)
plt.xlabel('Total Inquiries')
plt.ylabel('Count')
plt.title('Total Inquiries Count');
# Loan amount histogram: 1000-wide bins starting at the smallest loan,
# x ticks every 2000 from 0 to 34000
loan_amounts = df['LoanOriginalAmount']
bins = np.arange(loan_amounts.min(), loan_amounts.max() + 1000, 1000)
ticks = np.arange(0, 36000, 2000)
labels = [str(v) for v in ticks]
plt.hist(data=df, x='LoanOriginalAmount', bins=bins);
plt.xticks(ticks, labels, rotation=90);
plt.xlabel('Loan Original Amount')
plt.ylabel('Count')
plt.title('Loan Original Amount Count');
# The 20 most common exact loan amounts, ordered from highest to lowest count
top_amounts = df['LoanOriginalAmount'].value_counts().head(20)
top_amounts.plot.bar(color=base_color);
plt.xlabel('Loan Original Amount')
plt.ylabel('Count')
plt.title('Loan Original Amount Count');
The most notable borrowed amounts are 4,000, 15,000, and 10,000 dollars.
# Cut the continuous loan amount into discrete 1000-wide ranges.
range_bin = np.arange(1000, df['LoanOriginalAmount'].max() + 1000, 1000)
# pd.cut builds right-closed intervals, so by default the first edge itself is
# excluded: a loan of exactly 1000 would land in no bin and silently vanish from
# the counts. include_lowest=True closes the first interval on the left.
bins = pd.cut(df['LoanOriginalAmount'], range_bin, include_lowest=True)
# Series.value_counts() is used instead of the deprecated top-level pd.value_counts().
loan_original = bins.value_counts()
# Bars ordered from highest count to lowest count
plt.figure(figsize=(12, 8))
loan_original.plot(kind='bar', color=base_color);
plt.xlabel('Loan Original Amount')
plt.ylabel('Count')
plt.title('Loan Original Amount Count');
# Same counts, bars ordered from lowest amount to highest amount
loan_original = bins.value_counts().sort_index()
plt.figure(figsize=(12, 8))
loan_original.plot(kind='bar', color=base_color);
plt.xlabel('Loan Original Amount')
plt.ylabel('Count')
plt.title('Loan Original Amount Count');
# Histogram of the lower credit-score bound, 20-point bins between 400 and 880
bins = np.arange(400, 900, 20)
ax = df['CreditScoreRangeLower'].plot.hist(bins=bins)
ax.set_xlim(400, 900)
ax.set_xlabel('Credit Score Range Lower')
ax.set_title('Credit Score Range Lower Frequency');
mean credit score was 680.
# Histogram of the upper credit-score bound, same 20-point bins; the x axis runs to 1000
bins = np.arange(400, 900, 20)
ax = df['CreditScoreRangeUpper'].plot.hist(bins=bins)
ax.set_xlim(400, 1000)
ax.set_xlabel('Credit Score Range Upper')
ax.set_title('Credit Score Range Upper Frequency');
mean credit score was 700.
# Summary statistics for a hand-picked set of credit, income and loan columns
summary_columns = [
    'CreditScoreRangeLower', 'CreditScoreRangeUpper', 'FirstRecordedCreditLine',
    'CurrentCreditLines', 'RevolvingCreditBalance', 'BankcardUtilization',
    'StatedMonthlyIncome', 'TotalProsperLoans', 'OnTimeProsperPayments',
    'LoanOriginationDate', 'LoanOriginationQuarter', 'MonthlyLoanPayment',
    'InvestmentFromFriendsCount', 'Investors',
]
df[summary_columns].describe()
# Monthly payment histogram: 50-wide bins, view limited to 0-1200
bins = np.arange(0, 1300, 50)
ax = df['MonthlyLoanPayment'].plot.hist(bins=bins)
ax.set_xlim(0, 1200)
ax.set_xlabel('Monthly Payment Amount')
ax.set_title('Monthly Payment Frequency');
Most people have a monthly payment of less than 400 dollars, but some have a monthly payment as high as 1,000 dollars.
# Number of investors per loan: the 20 most frequent values, most common first
top_investor_counts = df['Investors'].value_counts().head(20)
top_investor_counts.plot.bar(color=base_color);
plt.xlabel('Number of Investor')
plt.ylabel('Count')
plt.title('Number of Investor Count');
Loans with only one investor are the most common.
# Borrower count per state on a wide figure, most common state first
plt.figure(figsize=(12, 8))
state_counts = df['BorrowerState'].value_counts()
state_counts.plot.bar(color=base_color);
plt.xlabel('Borrower State')
plt.ylabel('Count')
plt.title('Borrower State Count');
California has the highest number of borrowers, followed by Texas, New York, and Florida.
# Cut the continuous debt-to-income ratio into 0.1-wide ranges.
# The edges run from 0 to 1.9 and the intervals are right-closed, so ratios of
# exactly 0 or above the last edge fall outside every bin and are not counted.
ratio_bin = np.arange(0, 2, 0.1)
bins = pd.cut(df['DebtToIncomeRatio'], ratio_bin)
# Series.value_counts() replaces the deprecated top-level pd.value_counts();
# the bars stay ordered from the most to the least populated range.
debt_ratio = bins.value_counts()
plt.figure(figsize=(12, 8))
debt_ratio.plot(kind='bar', color=base_color);
plt.xlabel('Debt To Income Ratio')
plt.ylabel('Count')
plt.title('Debt To Income Ratio Count');
Most debt-to-income ratios are between 0.1 and 0.2.
df['DebtToIncomeRatio'].describe()
# Cut the revolving credit balance into 5000-wide ranges.
revol_bin = np.arange(0, df['RevolvingCreditBalance'].max(), 5000)
bins = pd.cut(df['RevolvingCreditBalance'], revol_bin)
# Keep the 20 most populated ranges, then put them back in ascending balance
# order. Series.value_counts() replaces the deprecated top-level pd.value_counts().
revol_balance = bins.value_counts().head(20).sort_index()
plt.figure(figsize=(12, 8))
revol_balance.plot(kind='bar', color=base_color);
plt.xlabel('Revolving Credit Balance')
plt.ylabel('Count')
plt.title('Revolving Credit Balance Count');
df['RevolvingCreditBalance'].describe()
# Loan status counts, most frequent status first
ax = df['LoanStatus'].value_counts().plot.bar(color=base_color)
ax.set_xlabel('Loan Status')
ax.set_ylabel('Count')
ax.set_title('Loan Status Count');
Most loans are current, followed by loans that have been completed.
Discuss the distribution(s) of your variable(s) of interest. Were there any unusual points? Did you need to perform any transformations?
When I plotted the monthly payment as a histogram, I observed that it is right-skewed. I did not perform any transformation because at this point I was only investigating the distributions. I also observed that in most cases there is only one investor. California is the state with the most borrowers, which makes sense because its population is larger.
Of the features you investigated, were there any unusual distributions? Did you perform any operations on the data to tidy, adjust, or change the form of the data? If so, why did you do this?
When I was investigating the original loan amount, I found that a bar plot is more appropriate than a histogram, so I transformed the feature into amount ranges.
# The main frame contains a lot of variables, so the bivariate and multivariate
# sections work on a smaller frame holding only these columns.
bivariate_columns = [
    'LoanStatus',
    'BorrowerAPR',
    'BorrowerRate',
    'LenderYield',
    'ProsperScore',
    'BorrowerState',
    'Occupation',
    'EmploymentStatus',
    'IsBorrowerHomeowner',
    'CreditScoreRangeLower',
    'CreditScoreRangeUpper',
    'RevolvingCreditBalance',
    'BankcardUtilization',
    'DebtToIncomeRatio',
    'IncomeRange',
    'LoanMonthsSinceOrigination',
    'LoanOriginalAmount',
    'LoanOriginationDate',
    'MonthlyLoanPayment',
    'LoanOriginationQuarter',
    'Investors',
]
# Select with a list rather than a tuple (pandas uses tuples to spell a single
# MultiIndex key), and take an explicit copy: new columns are added to df_bi
# later on, and those writes should never be tied back to df.
df_bi = df.loc[:, bivariate_columns].copy()
# Two-column subset "loan_origin" used for the time-series plots
loan_origin = df_bi.loc[:, ['LoanOriginationDate', 'LoanOriginalAmount']]
loan_origin.isnull().sum()
# Parse the origination date and make it the index so the frame can be resampled
loan_origin['LoanOriginationDate'] = pd.to_datetime(loan_origin['LoanOriginationDate'])
loan_origin.set_index('LoanOriginationDate', inplace=True)
# Plotting every loan would overplot, so total the amounts per calendar month
monthly = loan_origin.resample('M').sum()
ax = monthly.plot();
ax.set_xlabel('Loan Origination Month')
ax.set_ylabel('Loan Original Amount')
ax.set_title('Loan Origination Month vs Loan Original Amount');
Loan amounts increased over time except in 2009. The decrease in 2009 may be due to the financial crisis.
# Daily totals of the original loan amount
daily = loan_origin.resample('D').sum()
daily.plot()
plt.xlabel('Loan Origination Date')
plt.ylabel('Loan Original Amount')
plt.title('Loan Origination date vs Loan Original Amount');
# Smoother curve: Gaussian-weighted rolling sum (std = 10) over a centred
# 50-day window. Note this is a weighted sum, not an average.
smoothing_window = daily.rolling(50, center=True, win_type='gaussian')
smoothing_window.sum(std=10).plot()
# Keep only the year of the origination date as a new column
origination_dates = pd.to_datetime(df_bi['LoanOriginationDate'])
df_bi['LoanOriginationYear'] = origination_dates.dt.year
df_bi.head()
# Box plot of borrower rate for each origination year
ax = sns.boxplot(x='LoanOriginationYear', y='BorrowerRate',
                 data=df_bi, color=base_color)
ax.set_xlabel('Loan Origination Year')
ax.set_ylabel('Borrower Rate')
ax.set_title('Loan Origination Year vs Borrower Rate');
The borrower rate increased from 2008 to 2011 and decreased after that.
# Same relationship as the box plot, drawn as a violin plot
ax = sns.violinplot(x='LoanOriginationYear', y='BorrowerRate',
                    data=df_bi, color=base_color);
ax.set_xlabel('Loan Origination Year')
ax.set_ylabel('Borrower Rate')
ax.set_title('Loan Origination Year vs Borrower Rate');
# Same relationship again, drawn as a point plot
ax = sns.pointplot(x='LoanOriginationYear', y='BorrowerRate',
                   data=df_bi, color=base_color)
ax.set_xlabel('Loan Origination Year')
ax.set_ylabel('Borrower Rate')
ax.set_title('Loan Origination Year vs Borrower Rate');
# Box plot of borrower APR for each origination year
ax = sns.boxplot(x='LoanOriginationYear', y='BorrowerAPR',
                 data=df_bi, color=base_color);
ax.set_xlabel('Loan Origination Year')
ax.set_ylabel('Borrower APR')
ax.set_title('Loan Origination Year vs Borrower APR');
The borrower APR increased from 2008 to 2011 and decreased after that.
# Box plot of Prosper score for each origination year
ax = sns.boxplot(x='LoanOriginationYear', y='ProsperScore',
                 data=df_bi, color=base_color);
ax.set_xlabel('Loan Origination Year')
ax.set_ylabel('Prosper Score')
ax.set_title('Loan Origination Year vs Prosper Score');
The mean Prosper score changes over time.
# Box plot of original loan amount for each employment status
ax = sns.boxplot(x='EmploymentStatus', y='LoanOriginalAmount',
                 data=df_bi, color=base_color);
ax.set_xlabel('Employment Status')
ax.set_ylabel('Loan Original Amount')
ax.set_title('Loan Original Amount vs Employment Status');
plt.xticks(rotation=15);
The mean loan amount was highest in the Employed group.
# Box plot of Prosper score for each employment status
ax = sns.boxplot(x='EmploymentStatus', y='ProsperScore',
                 data=df_bi, color=base_color);
ax.set_xlabel('Employment Status')
ax.set_ylabel('Prosper Score')
ax.set_title('Employment Status vs Prosper Score');
plt.xticks(rotation=15);
The mean Prosper score was highest in the Full-time employment status category.
# Box plot of borrower APR for each employment status
ax = sns.boxplot(x='EmploymentStatus', y='BorrowerAPR',
                 data=df_bi, color=base_color);
ax.set_xlabel('Employment Status')
ax.set_ylabel('Borrower APR')
ax.set_title('Employent Status vs Borrower APR');
plt.xticks(rotation=15);
Borrower APR was highest in the Not employed group.
# Mean borrower APR per employment status, as numbers
apr_by_employment = df_bi.groupby('EmploymentStatus')['BorrowerAPR']
apr_by_employment.mean()
# Box plot of borrower rate for each employment status
ax = sns.boxplot(x='EmploymentStatus', y='BorrowerRate',
                 data=df_bi, color=base_color);
ax.set_xlabel('Employment Status')
ax.set_ylabel('Borrower Rate')
ax.set_title('Employent Status vs Borrower Rate');
plt.xticks(rotation=15);
The borrower rate was highest in the Not employed group.
# Mean borrower rate per employment status, as numbers
rate_by_employment = df_bi.groupby('EmploymentStatus')['BorrowerRate']
rate_by_employment.mean()
# Box plot of borrower APR for each income range
ax = sns.boxplot(x='IncomeRange', y='BorrowerAPR',
                 data=df_bi, color=base_color);
ax.set_xlabel('Income Range')
ax.set_ylabel('Borrower APR')
ax.set_title('Income Range vs Borrower APR');
plt.xticks(rotation=15);
Borrower APR was highest for the Not employed group and lowest for those who earn more than 100k.
# Mean borrower APR per income range, as numbers
apr_by_income = df_bi.groupby('IncomeRange')['BorrowerAPR']
apr_by_income.mean()
# Box plot of borrower rate for each income range
ax = sns.boxplot(x='IncomeRange', y='BorrowerRate',
                 data=df_bi, color=base_color);
ax.set_xlabel('Income Range')
ax.set_ylabel('Borrower Rate')
ax.set_title('Income Range vs Borrower Rate');
plt.xticks(rotation=15);
The borrower rate was highest for the Not employed group and lowest for those who earn more than 100k.
# Collapse LoanStatus into two groups: loans that are 'Current', 'Completed' or
# 'Cancelled' are labelled 'Completed'; every other status is labelled 'Defaulted'
is_completed_like = df_bi['LoanStatus'].isin(['Current', 'Completed', 'Cancelled'])
df_bi['Status'] = np.where(is_completed_like, 'Completed', 'Defaulted')
df_bi.head()
# Count of loans in each of the two Status groups
ax = sns.countplot(x='Status', data=df_bi);
ax.set_xlabel('Status')
ax.set_ylabel('Count')
ax.set_title('Status Count');
# Status split for every occupation, most common occupation at the top
occupation_counts = df_bi['Occupation'].value_counts()
order_type = occupation_counts.index
plt.figure(figsize=(8, 12))
sns.countplot(y='Occupation', hue='Status', data=df_bi, order=order_type);
plt.xlabel('Count')
plt.ylabel('Occupation')
plt.title('Status vs Occupataion');
# Same plot restricted to the 3rd through 20th most common occupations,
# which leaves out the two largest groups
order_type = df_bi['Occupation'].value_counts().index[2:20]
plt.figure(figsize=(8, 12))
sns.countplot(y='Occupation', hue='Status', data=df_bi, order=order_type);
plt.xlabel('Count')
plt.ylabel('Occupation')
plt.title('Status vs Occupataion');
It is hard to tell from the first plot whether any group defaulted the most. From the second plot, it seems that the default rate of the Clerical group is higher than that of any other group.
# Status split for each employment status
ax = sns.countplot(x='EmploymentStatus', hue='Status', data=df_bi);
ax.set_xlabel('Employment Status')
ax.set_ylabel('Count')
ax.set_title('Status vs Employment Status');
plt.xticks(rotation=15);
The Full-time group has a notable number of defaults.
# Status split for each income range
ax = sns.countplot(x='IncomeRange', hue='Status', data=df_bi);
ax.set_xlabel('Income Range')
ax.set_ylabel('Count')
ax.set_title('Status vs Income Range');
plt.xticks(rotation=15);
The proportion of defaulters is highest for the Not displayed group, followed by the 1-24,999 dollar group.
# Status split for each borrower state, most common state at the top;
# the legend is pinned to location code 1
state_counts = df_bi['BorrowerState'].value_counts()
order_type_state = state_counts.index
plt.figure(figsize=(8, 12))
sns.countplot(y='BorrowerState', hue='Status', data=df_bi, order=order_type_state);
plt.xlabel('Count')
plt.ylabel('Borrower State')
plt.title('Status vs Borrower State')
plt.legend(loc=1);
It is hard to tell whether any state defaulted the most.
# Status split for each Prosper score, without an explicit order
ax = sns.countplot(x='ProsperScore', hue='Status', data=df_bi);
ax.set_xlabel('Prosper Score')
ax.set_ylabel('Count')
ax.set_title('Status vs Prosper Score');
# Same plot with the scores ordered from most to least frequent
#plt.figure(figsize=(8,12))
score_counts = df_bi['ProsperScore'].value_counts()
order_type = score_counts.index
ax = sns.countplot(x='ProsperScore', hue='Status', data=df_bi, order=order_type);
ax.set_xlabel('Prosper Score')
ax.set_ylabel('Count')
ax.set_title('Status vs Prosper Score');
A score of 1 is quite alarming: these borrowers have around a 50% chance of defaulting.
# Boolean masks for the two Status groups; they are built on df_bi and applied
# to df, which works because the two frames are aligned on their index
completed = df_bi['Status'] == 'Completed'
defaulted = df_bi['Status'] == 'Defaulted'
# Overlapping, semi-transparent borrower-rate histograms, one per group
df.loc[completed, 'BorrowerRate'].hist(bins=30, alpha=0.5, label='Completed')
df.loc[defaulted, 'BorrowerRate'].hist(bins=30, alpha=0.5, label='Defaulted')
plt.xlabel('Borrower Rate')
plt.ylabel('Count')
plt.title('Status vs Borrower Rate');
plt.legend();
As the borrower rate gets higher, the chance of defaulting gets higher.
# Masks for the two Status groups (built on df_bi, applied to df by index)
completed = df_bi['Status'] == 'Completed'
defaulted = df_bi['Status'] == 'Defaulted'
# Overlapping, semi-transparent borrower-APR histograms, one per group
df.loc[completed, 'BorrowerAPR'].hist(bins=30, alpha=0.5, label='Completed')
df.loc[defaulted, 'BorrowerAPR'].hist(bins=30, alpha=0.5, label='Defaulted')
plt.xlabel('Borrower APR')
plt.ylabel('Frequency')
plt.title('Status vs Borrower APR');
plt.legend();
In general, as the APR increases, the number of defaulters increases.
# Lower credit-score bound: overlapping histograms per Status group,
# 20-point bins, view limited to 400-900
completed = df_bi['Status'] == 'Completed'
defaulted = df_bi['Status'] == 'Defaulted'
bins = np.arange(400, 900, 20)
df.loc[completed, 'CreditScoreRangeLower'].plot.hist(bins=bins, alpha=0.5, label='Completed')
df.loc[defaulted, 'CreditScoreRangeLower'].plot.hist(bins=bins, alpha=0.5, label='Defaulted')
plt.xlabel('Credit Score Range Lower')
plt.ylabel('Frequency')
plt.title('Status vs Credit Score Range Lower');
plt.legend()
plt.xlim(400, 900);
# Upper credit-score bound: same construction
completed = df_bi['Status'] == 'Completed'
defaulted = df_bi['Status'] == 'Defaulted'
bins = np.arange(400, 900, 20)
df.loc[completed, 'CreditScoreRangeUpper'].plot.hist(bins=bins, alpha=0.5, label='Completed')
df.loc[defaulted, 'CreditScoreRangeUpper'].plot.hist(bins=bins, alpha=0.5, label='Defaulted')
plt.xlabel('Credit Score Range Upper')
plt.ylabel('Frequency')
plt.title('Status vs Credit Score Range Upper');
plt.legend()
plt.xlim(400, 900);
Borrowers with a credit score below 600 default the most.
df_bi.shape[0]
# Fix the seed so the same rows are drawn on every run.
np.random.seed(2018)
# Draw 10000 distinct row positions from 0 .. len(df_bi) - 1.
sample = np.random.choice(df_bi.shape[0], 10000, replace=False)
# The drawn numbers are positions, so select with .iloc. With .loc they would be
# looked up as index labels, which only gives the right rows while the index is
# the default 0..n-1 range. Copy so that later in-place edits of the sample
# (such as dropna(inplace=True)) act on an independent frame.
df_subset = df_bi.iloc[sample].copy()
df_subset.head()
# Prosper score vs borrower rate on the sample: scatter with a fitted regression
# line; points are jittered along x and almost transparent to reduce overplotting
plt.figure(figsize=(8, 6))
sns.regplot(x='ProsperScore', y='BorrowerRate', data=df_subset,
            scatter_kws={'alpha': 1/50}, x_jitter=0.3);
plt.xlabel('Prosper Score')
plt.ylabel('Borrower Rate')
plt.title('Prosper Score vs Borrower Rate');
# Much the same view drawn as a jittered strip plot
plt.figure(figsize=(8, 6))
sns.stripplot(x='ProsperScore', y='BorrowerRate', data=df_subset, jitter=.2);
plt.xlabel('Prosper Score')
plt.ylabel('Borrower Rate')
plt.title('Prosper Score vs Borrower Rate');
Borrower rate is negatively correlated with Prosper Score.
Skip following plot.
# Boxen plot of borrower rate per employment status on the sample.
# catplot is a figure-level function: it always creates its own figure, so a
# preceding plt.figure(figsize=...) only leaves an empty extra figure behind and
# its size is ignored. The 8 x 6 inch size is requested through height/aspect.
sns.catplot(x="EmploymentStatus", y="BorrowerRate", data=df_subset, kind='boxen',
            height=6, aspect=8 / 6);
# The catplot figure is now the current one, so pyplot calls label its axes.
plt.xlabel('Employment Status')
plt.ylabel('Borrower Rate')
plt.title('Employment Status vs Borrower Rate');
plt.xticks(rotation=15);
# Numeric columns whose pairwise correlations are shown in the heat maps
heat = [
    'BorrowerAPR', 'BorrowerRate', 'LenderYield', 'ProsperScore',
    'CreditScoreRangeLower', 'CreditScoreRangeUpper', 'RevolvingCreditBalance',
    'BankcardUtilization', 'DebtToIncomeRatio', 'LoanOriginalAmount',
]
# Correlation matrix of the sample, first with default settings, then with a
# reversed 'rocket' colour map and each coefficient printed to two decimals
correlations = df_subset[heat].corr()
sns.heatmap(correlations);
sns.heatmap(correlations, fmt='.2f', annot=True, cmap='rocket_r');
Borrower rate and Borrower APR are negatively correlated with Prosper Score.
# Homeowner vs non-homeowner counts, split by Status
plt.figure(figsize=(6, 4))
ax = sns.countplot(x='IsBorrowerHomeowner', hue='Status', data=df_bi);
ax.set_xlabel("Home Owner")
ax.set_ylabel("Count")
ax.set_title("Is Borrower Home Owner");
Talk about some of the relationships you observed in this part of the investigation. How did the feature(s) of interest vary with other features in the dataset?
The original loan amount was lowest just after the 2008 financial crisis; after that, the amount increased year over year. From 2013 to 2014 the amount grew considerably. Borrower Rate and APR increased until 2011 and then decreased. Higher-income people get a lower borrower rate and APR.
Did you observe any interesting relationships between the other features (not the main feature(s) of interest)?
It is hard to tell whether someone will default from their occupation, their state, or whether or not they are a homeowner.
Create plots of three or more variables to investigate your data even further. Make sure that your investigations are justified, and follow from your work in the previous sections.
#bins = np.arange(0, 35000, 1000)
#g = sns.FacetGrid(data = df_subset, col = 'EmploymentStatus', col_wrap= 3);
#g.map(plt.hist, 'LoanOriginalAmount', bins = bins)
# %%script false
# skip this, only for future reference for myself
#g = sns.FacetGrid(df_subset, col="EmploymentStatus",col_wrap = 3)
#g.map(sns.boxplot, "LoanOriginationYear", "CreditScoreRangeUpper");
#%%script false
#g = sns.FacetGrid(df_subset, col="IncomeRange",col_wrap = 3, hue = 'Status' )
#g.map(sns.scatterplot, "BorrowerRate", "ProsperScore", alpha = 0.3);
# Borrower rate per employment status, one box per Status group
ax = sns.boxplot(x='EmploymentStatus', y='BorrowerRate',
                 hue="Status", data=df_bi);
ax.set_xlabel('Employment Status')
ax.set_ylabel('Borrower Rate')
ax.set_title('Employment Status and Borrower Rate vs Status');
plt.xticks(rotation=15);
# Same three variables drawn as a point plot
ax = sns.pointplot(x='EmploymentStatus', y='BorrowerRate',
                   hue="Status", data=df_bi);
ax.set_xlabel('Employment Status')
ax.set_ylabel('Borrower Rate')
ax.set_title('Employment Status and Borrower Rate vs Status');
plt.xticks(rotation=15);
# Point plot again, now with dodged groups, no connecting lines and
# standard-deviation error bars
ax = sns.pointplot(x='EmploymentStatus', y='BorrowerRate', hue="Status",
                   data=df_bi, ci='sd', linestyles="", dodge=True);
ax.set_xlabel('Employment Status')
ax.set_ylabel('Borrower Rate')
ax.set_title('Employment Status and Borrower Rate vs Status');
plt.xticks(rotation=15);
# Same three variables as a bar plot with standard-deviation error bars
ax = sns.barplot(x='EmploymentStatus', y='BorrowerRate', hue="Status",
                 data=df_bi, ci='sd');
ax.set_xlabel('Employment Status')
ax.set_ylabel('Borrower Rate')
ax.set_title('Employment Status and Borrower Rate vs Status');
plt.xticks(rotation=15);
The borrower rate is higher for defaulters in each group.
# Borrower APR per employment status and Status group: dodged points,
# no connecting lines, standard-deviation error bars
ax = sns.pointplot(x='EmploymentStatus', y='BorrowerAPR', hue="Status",
                   data=df_bi, ci='sd', linestyles="", dodge=True);
ax.set_xlabel('Employment Status')
ax.set_ylabel('Borrower APR')
ax.set_title('Employment Status and Borrower APR vs Status');
plt.xticks(rotation=15);
Borrower APR is higher for defaulters in each group.
# Borrower rate per origination year, one box per Status group.
# dodge takes a boolean; the string "True" only worked because any non-empty
# string is truthy, so pass the real flag.
sns.boxplot(data=df_bi, x='LoanOriginationYear',
            y='BorrowerRate', hue="Status", dodge=True);
plt.xlabel('Loan Origination Year')
plt.ylabel('Borrower Rate')
plt.title('Loan Origination Year and Borrower Rate vs Status');
# Borrower rate per origination year and Status group, drawn as a point plot
ax = sns.pointplot(x='LoanOriginationYear', y='BorrowerRate',
                   hue="Status", data=df_bi);
ax.set_xlabel('Loan Origination Year')
ax.set_ylabel('Borrower Rate')
ax.set_title('Loan Origination Year and Borrower Rate vs Status');
The borrower rate is higher for defaulters in each year.
# Just a different plot type: scatter of borrower rate against origination
# year on the sample, coloured by Status
ax = sns.scatterplot(x='LoanOriginationYear', y='BorrowerRate', hue="Status",
                     data=df_subset, alpha=0.3);
ax.set_xlabel('Loan Origination Year')
ax.set_ylabel('Borrower Rate')
ax.set_title('Loan Origination Year and Borrower Rate vs Status');
# Borrower rate against the upper credit-score bound, coloured by Status;
# the view is limited to scores between 400 and 900
ax = sns.scatterplot(x='CreditScoreRangeUpper', y='BorrowerRate', hue="Status",
                     data=df_subset, alpha=0.3);
ax.set_xlabel('Credit Score Range Upper')
ax.set_ylabel('Borrower Rate')
ax.set_title('Credit Score Range Upper and Borrower Rate vs Status');
ax.set_xlim(400, 900);
Borrowers with lower credit scores get a higher borrower rate and default the most.
END
# Remove, in place, every sampled row that has at least one missing value
df_subset.dropna(axis=0, how='any', inplace=True)
# skip
#plt.hist2d(data = df_subset, x = 'ProsperScore', y = 'BorrowerRate', cmin =0.5, cmap = 'viridis_r')
#plt.colorbar()
#plt.xlabel('Prosper score')
#plt.ylabel('borrower rate');
This is only for myself. Skip this
Univariate Summary:
bar plot, count plot
(pie chart can be used in bar chart)
histogram
Bivariate:
boxplot, violin plot, point plot, catplot
clustered bar plot (countplot, bar plot)
overlap histogram
regplot, stripplot
scatter plot
heatmap
Multivariate:
clustered box plot, violin plot,point plot, catplot
clustered bar plot (countplot, bar plot)
scatter plot